Introduction

Basic data visualization of beatmap info provided by osu! API. https://github.com/ppy/osu-api/wiki

Only ranked, loved, and qualified maps are used. The graphs focus mostly on standard mode.

(A few maps, due to ranked/loved irregularities, are actually in the graveyard but are included in the API query anyway. For example, https://osu.ppy.sh/b/766190&m=2 has one loved CtB difficulty.)

library(ggplot2)
library(gridExtra)
library(plyr)
library(jsonlite)
library(varhandle)
library(chron)
library(magrittr)
# Cached chunk
# Import data from JSON and remove duplicate rows.
# NOTE(review): assumes fromJSON() yields a list of data frames that rbind()
# can stack into one data frame -- confirm against the layout of maps.json.
beatmaps <- unique(do.call("rbind", fromJSON("maps.json")))
# Set global figure width and height
knitr::opts_chunk$set(fig.width = 10, fig.height = 6)
# Disable warnings and messages
knitr::opts_chunk$set(message = FALSE, warning = FALSE)

# Convert strings of ints and floats to numeric datatypes.
# A column counts as integer only if every value parses as an integer, and as
# numeric if every value parses as a number but the column is not all-integer.
beatmaps.isintcol <- vapply(
  beatmaps,
  function(col) all(check.numeric(col, only.integer = TRUE)),
  logical(1)
)
beatmaps.isnumcol <- vapply(
  beatmaps,
  function(col) all(check.numeric(col)),
  logical(1)
) & !beatmaps.isintcol
# List-style indexing with lapply() converts column by column. The matrix-style
# `beatmaps[, mask]` form collapses to a plain vector when exactly one column
# matches, which made sapply() iterate over cells instead of columns.
beatmaps[beatmaps.isintcol] <- lapply(beatmaps[beatmaps.isintcol], as.integer)
beatmaps[beatmaps.isnumcol] <- lapply(beatmaps[beatmaps.isnumcol], as.numeric)

# Convert MySQL datetimes to R datetimes. As far as the new site goes, this appears to be UTC-4, but
# this may be a local thing. Probably broken.
# ("Etc/GMT+4" follows the POSIX sign convention, i.e. it means UTC-4.)
beatmaps$approved_date <- as.POSIXct(beatmaps$approved_date, tz = "Etc/GMT+4")
beatmaps$last_update <- as.POSIXct(beatmaps$last_update, tz = "Etc/GMT+4")


# Create labels and data frames for each gamemode
gamemodes <- c("std", "taiko", "ctb", "mania")
gamemode.labels <- c("Standard", "Taiko", "CtB", "Mania")
# Pin the levels to the osu! API mode codes (0 = Standard, 1 = Taiko, 2 = CtB,
# 3 = Mania) so the labels stay correct, and the call does not fail, even if
# one of the modes happens to be absent from the data.
beatmaps$mode <- factor(
  beatmaps$mode,
  levels = seq_along(gamemode.labels) - 1L,
  labels = gamemode.labels
)
# Creates the global data frames `std`, `taiko`, `ctb` and `mania`.
# %in% (rather than ==) keeps rows with a missing mode from turning into
# all-NA rows in every per-mode data frame.
for (i in seq_along(gamemodes)) {
  assign(gamemodes[i], beatmaps[beatmaps$mode %in% gamemode.labels[i], ])
}

# Various plot parameters for convenience
# These will usually leave a few outlier maps out
AR_y_scale          <- scale_y_continuous(breaks = seq(0, 10))
SR_y_scale          <- scale_y_continuous(limits = c(0, 10), breaks = seq(0, 10, 1))
approved_x_scale    <- scale_x_datetime(date_breaks = "1 year", date_labels = "%Y")

legend_title_fill   <- labs(fill = "Mode")
legend_title_color  <- labs(color = "Mode")

# Center titles
theme_update(plot.title = element_text(hjust = 0.5))

Plots

Star Rating

# Shared layers for the star rating plots: 0.05-star bins on a 0-10 star axis
# with a tick every half star
diff_hist <- geom_histogram(binwidth = 0.05)
diff_x_scale <- scale_x_continuous(
  breaks = seq(0, 10, 0.5),
  limits = c(0, 10)
)

# Histogram of star rating, filled by mode (all modes)
ggplot(beatmaps, aes(x = difficultyrating, fill = as.factor(mode))) +
  diff_hist +
  diff_x_scale +
  legend_title_fill +
  ggtitle("Total Star Rating (All Modes)")

# Frequency polygon of star rating, one line per mode (all modes)
ggplot(beatmaps, aes(x = difficultyrating, color = as.factor(mode))) +
  geom_freqpoly(binwidth = 0.05) +
  diff_x_scale +
  legend_title_color +
  ggtitle("Star Rating (All Modes)")

# One star rating histogram per mode, each with its own y scale
diffplots <- lapply(seq_along(gamemode.labels), function(mode_index) {
  mode_rows <- beatmaps[as.integer(beatmaps$mode) == mode_index, ]
  ggplot(mode_rows, aes(x = difficultyrating)) +
    diff_hist +
    diff_x_scale +
    ggtitle(gamemode.labels[mode_index])
})
grid.arrange(grobs = diffplots, top = "Star Rating Distributions (All Modes)")

Until recently, the ranking criteria required non-marathon mapsets to include a difficulty of Normal or below. Under the new criteria, any mapset with less than 3:30 of drain time requires a Normal or below. This explains the large number of maps between 1* and 2*.

After these Easy and Normal maps, the most popular standard maps are between 3* and 4*. There is a small but noticeable spike in standard maps at about 5.25*.

Taiko stands out for having the most unimodal-looking distribution.

Total Length

# Shared layers for the length plots: 1-second bins on a 0-600 s axis with a
# tick every 30 s, plus vertical tick labels for the small multiples
length_hist <- geom_histogram(binwidth = 1)
length_x_scale <- scale_x_continuous(
  breaks = seq(0, 600, 30),
  limits = c(0, 600)
)
x_labels_90 <- theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Histogram of total length, filled by mode (all modes)
ggplot(beatmaps, aes(x = total_length, fill = as.factor(mode))) +
  length_hist +
  length_x_scale +
  legend_title_fill +
  ggtitle("Total Beatmap Length (All Modes)")

# Frequency polygon of total length, one line per mode (all modes)
ggplot(beatmaps, aes(x = total_length, color = as.factor(mode))) +
  geom_freqpoly(binwidth = 1) +
  length_x_scale +
  legend_title_color +
  ggtitle("Beatmap Length (All Modes)")

# One total length histogram per mode
lengthplots <- lapply(seq_along(gamemode.labels), function(mode_index) {
  mode_rows <- beatmaps[as.integer(beatmaps$mode) == mode_index, ]
  ggplot(mode_rows, aes(x = total_length)) +
    length_hist +
    length_x_scale +
    x_labels_90 +
    ggtitle(gamemode.labels[mode_index])
})

grid.arrange(grobs = lengthplots, top = "Beatmap Length Distributions (All Modes)")

The massive spike in maps 85-90 seconds long corresponds to the ever-popular TV Size map genre.

This occurs in every gamemode, though mania and taiko have relatively large numbers of two minute maps.

Playcount

# Frequency polygons of playcount, one line per mode, in bins of 5000 plays.
# The x axis stops at one million plays, so more popular maps are left out.
ggplot(beatmaps, aes(x = playcount, color = as.factor(mode))) +
  geom_freqpoly(binwidth = 5000) +
  scale_x_continuous(limits = c(0, 1000000)) +
  legend_title_color +
  ggtitle("Playcount (All Modes)")

Date approved

# Bucket every map by the calendar month in which it was approved
beatmaps$month <- cut(beatmaps$approved_date, breaks = "month")
# Use the year-level cut points as the only x axis breaks, so the axis is not
# crowded with one label per month
approved_years <- unique(cut(beatmaps$approved_date, breaks = "year"))
year_x_scale <- scale_x_discrete(breaks = approved_years)
x_labels_45 <- theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Bar chart of date approved (all modes).
# The legend title is set explicitly so it reads "Mode", consistent with the
# other plots in this document, instead of the raw column name.
ggplot(beatmaps, aes(x = month, fill = mode)) +
  ggtitle("Date Approved (All Modes)") +
  legend_title_fill +
  geom_bar(width = 1) +
  year_x_scale + x_labels_45

# Frequency polygon of date approved (all modes).
# stat = "count" is needed because `month` is a discrete (factor) variable;
# group = mode draws one line per mode.
ggplot(beatmaps, aes(x = month, group = mode, color = mode)) +
  ggtitle("Date Approved (All Modes)") +
  legend_title_color +
  geom_freqpoly(stat = "count") +
  year_x_scale + x_labels_45

Tables

Most frequent artists, titles, sources, and creators

library(knitr)
# Render a table of the most frequent values of a vector.
#
# Arguments:
#   arr  vector of values to tabulate
#   lab  header to use for the value column
#   n    number of rows to show, most frequent first (default 20, the value
#        that was previously hard-coded)
#
# Returns the result of kable() with columns `lab` and "Freq".
most.frequent.kable <- function(arr, lab, n = 20) {
  arr %>%
    table() %>%
    sort(decreasing = TRUE) %>%
    head(n) %>%
    kable(col.names = c(lab, "Freq"))
}

# Table of the most frequent values in the artist column
most.frequent.kable(beatmaps$artist, "Artist")
Artist Freq
Hatsune Miku 653
ClariS 421
KOTOKO 398
fripSide 362
xi 349
senya 339
IOSYS 331
Various Artists 329
LiSA 323
yanaginagi 311
Camellia 309
ZUN 272
Duca 257
Rita 255
M2U 245
Chata 226
u’s 225
Hanatan 224
nano 224
Suzuki Konomi 211
# Table of the most frequent values in the title column
most.frequent.kable(beatmaps$title, "Title")
Title Freq
Piano 7K BMS Pack 193
Piano Beatmap Set 114
Harumachi Clover 70
Granat 65
Ai no Scenario 62
PEPPY FIX TAIKO STAR RATING PLEASE for a happier 61
Tokyo (Innovaderz Remix) 61
MIIRO 58
Hitorigoto -TV MIX- 55
Re:TrymenT 54
Haru Modoki 52
Natsukoi Hanabi 52
TSLove 52
Contrail Kiseki 51
Paradisus-Paradoxum 51
Untan Goose 51
Uso no Hibana 50
Six Trillion Years and Overnight Story 49
Gabriel Drop Kick 43
Oriental Blossom 42
# Table of the most frequent values in the source column
most.frequent.kable(beatmaps$source, "Source")
Source Freq
24652
Touhou 2370
BMS 1557
東方Project 754
SOUND VOLTEX III GRAVITY WARS 466
DJMAX 455
beatmania IIDX 398
SOUND VOLTEX II -infinite infection- 390
osu! 258
Vocaloid 242
Taiko no Tatsujin 222
Deemo 202
jubeat 187
Nico Nico Douga 178
Love Live! School idol project 174
SOUND VOLTEX BOOTH 169
REFLEC BEAT groovin’!! 166
Cytus 164
K-ON!! 153
艦隊これくしょん -艦これ- 146
# Table of the most frequent values in the creator column
most.frequent.kable(beatmaps$creator, "Creator")
Creator Freq
osuplayer111 572
Sotarks 571
DJPop 563
tutuhaha 387
ztrot 377
Larto 354
Natsu 345
Monstrata 331
Ascendance 311
pishifat 311
Lasse 310
Gero 304
wcx19911123 293
Milan- 279
ouranhshc 262
alacat 248
NatsumeRin 247
MoonFragrance 234
James 232
Fycho 221

Most favorited mapsets

# Top 50 mapsets by favourite count. Favourites are tracked per mapset, so only
# the first row seen for each beatmapset_id is kept before ranking.
beatmaps[!duplicated(beatmaps$beatmapset_id), ] %>%
  arrange(desc(favourite_count)) %>%
  head(50) %>%
  subset(select = c(creator, artist, title, favourite_count)) %>%
  kable()
creator artist title favourite_count
W h i t e Kuba Oms My Love 13163
Fort Panda Eyes & Teminite Highscore 10292
jonathanlfj cYsmix feat. Emmy Tear Rain 9763
Charles445 Rostik Liquid (Paul Rosenthal Remix) 8282
VINXIS Reol No title 7604
Ekoro UNDEAD CORPORATION Everything will freeze 6886
Kuria Linked Horizon Guren no Yumiya (TV Size) 6880
Doormat ClariS Hitorigoto -TV MIX- 6606
Awaken Konuko Toumei Elegy 6534
Voltaeyx TheFatRat Mayday (feat. Laura Brehm) 6335
Takuya S3RL Pika Girl 5510
Bearizm Station Earth Cold Green Eyes ft. Roos Denayer 5425
Saten-san Yousei Teikoku Kokou no Sousei 5300
gowww Hatsune Miku & Megpoid Gumi MATRYOSHKA 4878
h3k1ru Yiruma & Skullee River Flows In You (A Love Note) 4849
ouranhshc Masayoshi Minoshima feat. nomico Bad Apple!! 4829
-kevincela- Rameses B Flaklypa 4829
ktgster Chasers Lost 4770
Sekai-nyan Suzuki Konomi This game (TV Size) 4618
Kuria ONE OK ROCK Answer is Near 4591
eLy Feint Tower Of Heaven (You Are Slaves) 4461
Secretpipe S3RL Bass Slut (Original Mix) 4399
Monstrata RADWIMPS Zen Zen Zense (movie ver.) 4352
Natsu Hanatan Airman ga Taosenai (SOUND HOLIC Ver.) 4328
NatsumeRin Hatsune Miku Senbonzakura (Short Ver.) 3966
kristi71111 TK from Ling tosite sigure unravel (TV edit) 3942
Rue DJ Genericname Dear You 3886
Monstrata Porter Robinson & Madeon Shelter 3860
Asphyxia xi Blue Zenith 3849
Kagetsu KANA-BOON Silhouette 3839
osuplayer111 Getter Jaani Rockefeller Street (Nightcore Mix) 3703
Smoothie UNDEAD CORPORATION Yoru Naku Usagi wa Yume o Miru 3677
Star Stream Wotamin Gigantic O.T.N 3627
Kyshiro toby fox MEGALOVANIA 3613
Blue Dragon The Quick Brown Fox The Big Black 3403
RLC Himeringo Yotsuya-san ni Yoroshiku 3377
Multiple Creators Soleily Renatus 3345
Sherry Nanahira Frightfully-insane Flan-chan’s frightful song 3327
jonathanlfj Reol Plus Danshi ver Reol 3324
Len Shawn Wasabi Marble Soda 3288
Ephemeral Masayoshi Minoshima ft. nomico Bad Apple!! 3219
Tarrasky Agnete Kjolsrud Get Jinxed 3165
Guy Aoi Eir IGNITE (TV size ver.) 3119
AllStar12 yuikonnu & ayaponzu* Super Nuko World 3033
Garven Saiya Remote Control 3028
handsome Reol MONSTER 3009
Gaia Reol Asymmetry 2993
rui Hatsune Miku Rubik’s Cube 2967
Monstrata 9mm Parabellum Bullet Inferno 2959
Jacob NOMA Brain Power 2908

Most played maps

# Top 50 individual difficulties by playcount
beatmaps %>%
  arrange(desc(playcount)) %>%
  head(50) %>%
  subset(select = c(creator, artist, title, version, playcount)) %>%
  kable()
creator artist title version playcount
W h i t e Kuba Oms My Love Hard 21429513
W h i t e Kuba Oms My Love Normal 20405157
jonathanlfj cYsmix feat. Emmy Tear Rain Normal 18188427
Blue Dragon The Quick Brown Fox The Big Black WHO’S AFRAID OF THE BIG BLACK 14209863
jonathanlfj cYsmix feat. Emmy Tear Rain Hard 12840626
ktgster Chasers Lost Normal 12456963
-kevincela- Rameses B Flaklypa Normal 12124613
Charles445 Rostik Liquid (Paul Rosenthal Remix) Easy 10881187
Blue Dragon Team Nekokan Can’t Defeat Airman Holy Shit! It’s Airman!! 10878775
-kevincela- Rameses B Flaklypa Hard 10611576
W h i t e Kuba Oms My Love Insane 9287944
Multiple Creators Soleily Renatus Normal 8065171
jonathanlfj cYsmix feat. Emmy Tear Rain Insane 8028482
ktgster Chasers Lost Hard 7715600
Rue DJ Genericname Dear You Dear Rue 7548246
Bearizm Station Earth Cold Green Eyes ft. Roos Denayer Divine 7343941
h3k1ru Yiruma & Skullee River Flows In You (A Love Note) Love Note 7328010
VINXIS Reol No title Light Insane 6820213
Charles445 Rostik Liquid (Paul Rosenthal Remix) Normal 6818829
val0108 Lily Scarlet Rose 0108 style 6684374
Charles445 Rostik Liquid (Paul Rosenthal Remix) Hard 6659517
Ekoro UNDEAD CORPORATION Everything will freeze Insane 6604128
Reikin Nico Nico Douga U.N. Owen Was Her? Normal 6495851
Fort Panda Eyes & Teminite Highscore Another 6442871
Fort Panda Eyes & Teminite Highscore LGV’s Insane 6255646
Lust Tsunamaru Daidai Genome Insane 6182514
Takuya S3RL Pika Girl Hard 5910259
Saten-san Yousei Teikoku Kokou no Sousei Hard 5760621
Nakagawa-Kanon xi FREEDOM DiVE Another 5520280
Kuria Linked Horizon Guren no Yumiya (TV Size) DS’s Hard 5517593
Flask Fujijo Seitokai Shikkou-bu Best FriendS -TV Size- Fycho’s Insane 5498716
VINXIS Reol No title byfaR’s Hard 5419243
Garven Saiya Remote Control Insane 5395495
xxdeathx FLOWxGRANRODEO 7 -seven- -TV SIZE - Expert 5357990
Damnae raja the light Normal 5289914
Kuria Linked Horizon Guren no Yumiya (TV Size) alacat’s Normal 5244344
Fort Panda Eyes & Teminite Highscore Hyper 5227173
Doormat ClariS Hitorigoto -TV MIX- Insane 5173748
Bearizm Station Earth Cold Green Eyes ft. Roos Denayer apple’s Insane 5172218
galvenize DJ Fresh Gold Dust Insane 5150833
Taeyang kradness&Reol Remote Control Max Control! 5141365
Luerxa Primastella Koigokoro Delis’ Insane 5138932
JauiPlaY DJ Okawari Flower Dance Flower 5138709
Multiple Creators Soleily Renatus Hard 5085398
val0108 Hatsune Miku Mythologia’s End Myth0108ia 5076064
Takuya S3RL Pika Girl Insane 5024386
Natsu Hanatan Airman ga Taosenai (SOUND HOLIC Ver.) Insane 4916733
Star Stream Sagara Kokoro Hoshizora no Ima S.S 4879941
Nakagawa-Kanon xi FREEDOM DiVE FOUR DIMENSIONS 4753427
Kuria Linked Horizon Guren no Yumiya (TV Size) Insane 4679371

Scatterplots

# Approach rate against BPM (standard mode, BPM axis capped at 500)
ggplot(std, aes(x = bpm, y = diff_approach)) +
  geom_point(alpha = 0.1) +
  scale_x_continuous(limits = c(0, 500)) +
  AR_y_scale +
  ggtitle("Approach Rate vs BPM")

# Star rating against total length (standard mode)
ggplot(std, aes(x = total_length, y = difficultyrating)) +
  geom_point(alpha = 0.1) +
  length_x_scale +
  SR_y_scale +
  ggtitle("Star Rating vs Total Length")

# Max combo against drain time (standard mode, combo axis capped at 4000)
ggplot(std, aes(x = hit_length, y = max_combo)) +
  geom_point(alpha = 0.05) +
  length_x_scale +
  scale_y_continuous(limits = c(0, 4000)) +
  ggtitle("Max Combo vs Drain Time")

# Linear fit of max combo on drain time; high linear correlation, as expected
lm(max_combo ~ hit_length, data = std) %>%
  summary()
## 
## Call:
## lm(formula = max_combo ~ hit_length, data = std)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3860.7  -134.3   -17.4   120.5 23020.5 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -99.20894    2.51272  -39.48   <2e-16 ***
## hit_length    4.89613    0.01715  285.53   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 272.8 on 56923 degrees of freedom
##   (4 observations deleted due to missingness)
## Multiple R-squared:  0.5888, Adjusted R-squared:  0.5888 
## F-statistic: 8.152e+04 on 1 and 56923 DF,  p-value: < 2.2e-16
# Favourite count against playcount (standard mode); axes are capped at one
# million plays and 1000 favourites
ggplot(std, aes(x = playcount, y = favourite_count)) +
  geom_point(alpha = 0.05) +
  scale_x_continuous(limits = c(0, 1000000)) +
  scale_y_continuous(limits = c(0, 1000)) +
  ggtitle("Favorite Count vs Playcount")

# Playcount against total length (standard mode, playcount capped at one million)
ggplot(std, aes(x = total_length, y = playcount)) +
  geom_point(alpha = 0.1) +
  length_x_scale +
  scale_y_continuous(limits = c(0, 1000000)) +
  ggtitle("Playcount vs Total Length")

# Approach rate against approval date (standard mode)
ggplot(std, aes(x = approved_date, y = diff_approach)) +
  geom_point(alpha = 0.05) +
  approved_x_scale +
  AR_y_scale +
  ggtitle("Approach Rate vs Date Approved")

# Star rating against approval date (standard mode)
ggplot(std, aes(x = approved_date, y = difficultyrating)) +
  geom_point(alpha = 0.1) +
  approved_x_scale +
  SR_y_scale +
  ggtitle("Star Rating vs Date Approved")

Spread info

# Playcount by song time, categorized by spread icon
# https://osu.ppy.sh/help/wiki/Difficulties#star-rating Not sure about values between boundaries

# Lower star rating bound of each spread. The trailing Inf closes the last
# interval: with only six breaks cut() produces five intervals, so "Expert+"
# was never assigned and maps at 6.76* and above were labelled NA.
spread.sr <- c(0, 1.51, 2.26, 3.76, 5.26, 6.76, Inf)
spread.names <- c("Easy", "Normal", "Hard", "Insane", "Expert", "Expert+")
spread.colors <- c("olivedrab3", "paleturquoise", "gold", "hotpink", "purple", "darkgray")

# Assign difficulty rating by spread ranges to spread names
# (intervals are closed on the left: [0, 1.51), [1.51, 2.26), ...)
beatmaps$spread_name <- spread.names[
  cut(beatmaps$difficultyrating, spread.sr, right = FALSE, labels = FALSE)
]
std <- beatmaps[beatmaps$mode == "Standard", ]  # update std

# Total playcount per hit-length bin for the maps of a single spread.
#   maps    data frame with spread_name, hit_length and playcount columns
#   spread  spread name to keep
#   bins    break points (seconds) passed to cut()
# Returns a numeric vector with one total per bin, named by the bin label.
playcount.by.hitlength <- function(maps, spread, bins) {
  # %in% drops rows whose spread_name is NA instead of keeping them as NA rows
  in.spread <- maps[maps$spread_name %in% spread, ]
  # Sum as double: if playcount is stored as integer, a bin total above the
  # integer range would become NA (and warnings are disabled in this document)
  vapply(
    split(as.numeric(in.spread$playcount), cut(in.spread$hit_length, bins)),
    sum,
    numeric(1)
  )
}

# One panel per spread, on a 2 x 3 grid
hitlength.bins <- seq(0, 360, 30)
old.par <- par(mfrow = c(2, 3), mar = c(4, 4, 4, 1), cex.main = 2)
for (i in seq_along(spread.names)) {
  playcount.bin.sum <- playcount.by.hitlength(std, spread.names[i], hitlength.bins)

  barplot(playcount.bin.sum, space = 0, width = 30, xlab = "Hit length (s)",
          ylab = "Playcount Total", main = spread.names[i],
          col = spread.colors[i], axisnames = FALSE)
  axis(1, at = hitlength.bins)
}
# Restore the previous graphical parameters. Unlike dev.off(), this does not
# close the graphics device.
par(old.par)

# Same but 150+ hitlength and stacked bars
hitlength.bins.150 <- seq(150, 360, 30)
playcount.bin.mat <- matrix(
  ncol = length(hitlength.bins.150) - 1,
  nrow = length(spread.names)
)
colnames(playcount.bin.mat) <- head(hitlength.bins.150, -1)
rownames(playcount.bin.mat) <- spread.names

for (i in seq_len(nrow(playcount.bin.mat))) {
  playcount.bin.mat[i, ] <- playcount.by.hitlength(std, spread.names[i], hitlength.bins.150)
}

barplot(playcount.bin.mat, space = 0, width = 30, col = spread.colors,
        xlab = "Hitlength (s)", ylab = "Total Playcount",
        legend.text = spread.names, axisnames = FALSE,
        main = "Total Playcount by Hitlength and Difficulty")
# Bars start at x = 0, so shift the tick positions while labelling them with
# the real hit lengths
axis(1, at = hitlength.bins.150 - hitlength.bins.150[1], labels = hitlength.bins.150)